In [69]:
    
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from pandas.tools.plotting import scatter_matrix
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
    
In [2]:
    
data = np.loadtxt('pca-data-2d.dat', delimiter=' ', skiprows=0, usecols=range(0, 2))
data.shape
    
    Out[2]:
In [3]:
    
data
    
    Out[3]:
In [4]:
    
m = np.mean(data, 0)
m
    
    Out[4]:
In [5]:
    
data_centered = np.subtract(data, m)
data_centered
    
    Out[5]:
In [6]:
    
plt.figure()
plt.scatter(data_centered.T[0], data_centered.T[1])
plt.xlabel("col1")
plt.ylabel("col2")
plt.grid()
plt.show()
    
    
In [7]:
    
covariance = np.cov(data_centered.T)
covariance
    
    Out[7]:
In [8]:
    
evals, evecs = np.linalg.eig(covariance)
transmat = evecs.T
transmat
    
    Out[8]:
In [9]:
    
evec1 = transmat[0] #evecs[:, 0]
evec2 = transmat[1] #evecs[:, 1]
evec1
    
    Out[9]:
In [10]:
    
data_trans = np.array([[0.0, 0.0] for i in range(len(data))])
for i in range(len(data_centered)):
    data_trans[i] = np.dot(transmat, data_centered[i])
data_trans
    
    Out[10]:
In [11]:
    
plt.figure(figsize=(10, 10))
plt.scatter(data_centered.T[0], data_centered.T[1])
plt.plot([0, evec1[0]], [0, evec1[1]])
plt.plot([0, evec2[0]], [0, evec2[1]])
plt.scatter(data_trans.T[0], data_trans.T[1])
plt.grid()
plt.show()
    
    
In [12]:
    
transmat_inv = np.linalg.inv(transmat)
data_trans_inv = np.array([[0.0, 0.0] for i in range(len(data))])
for i in range(len(data)):
    data_trans_inv[i] = np.dot(transmat_inv, data_trans[i])
data_trans_inv
data_trans_PC1 = np.copy(data_trans)
data_trans_PC1[:, 1] = 0
data_trans_inv_PC1 = np.array([[0.0, 0.0] for i in range(len(data))])
for i in range(len(data)):
    data_trans_inv_PC1[i] = np.dot(transmat_inv, data_trans_PC1[i])
data_trans_PC2 = np.copy(data_trans)
data_trans_PC2[:, 0] = 0
data_trans_inv_PC2 = np.array([[0.0, 0.0] for i in range(len(data))])
for i in range(len(data)):
    data_trans_inv_PC2[i] = np.dot(transmat_inv, data_trans_PC2[i])
data_trans_PC2
    
    Out[12]:
In [13]:
    
plt.figure(figsize=(10, 10))
plt.scatter(data_centered.T[0], data_centered.T[1])
plt.scatter(data_trans_inv_PC1.T[0], data_trans_inv_PC1.T[1])
plt.scatter(data_trans_inv_PC2.T[0], data_trans_inv_PC2.T[1])
red_patch = mpatches.Patch(color='red', label='full data')
blue_patch = mpatches.Patch(color='blue', label='only PC1')
green_patch = mpatches.Patch(color='green', label='only PC2')
plt.legend(handles=[red_patch, blue_patch, green_patch])
plt.grid()
plt.show()
    
    
In [14]:
    
data3d = np.loadtxt('pca-data-3d.txt', delimiter=',', skiprows=1)
data3d.shape # 3 axis 500 points
    
    Out[14]:
In [15]:
    
mean3d = np.mean(data3d, 0)
data3d_centered = np.subtract(data3d, mean3d)
mean3d
    
    Out[15]:
In [18]:
    
fig, axs = plt.subplots(3, 3, figsize=(9, 9))
for i in range(3):
    for j in range(3):
        axs[i][j].scatter(data3d_centered[:, i], data3d_centered[:, j])
        plt.tight_layout(1)
        axs[i][j].set_xlabel('col {}'.format(i+1))
        axs[i][j].set_ylabel('col {}'.format(j+1))
fig.suptitle('Pairwise scatter plots of columns (x, y, z)', y=1.05, fontsize=16)
plt.show()
    
    
In [19]:
    
covariance3d = np.cov(data3d_centered.T)
evals3d, evecs3d = np.linalg.eig(covariance3d)
transmat3d = evecs3d.T
covariance3d
transmat3d
evals3d
# => Z is PC1 // Y is PC2 // X is PC3
    
    Out[19]:
In [22]:
    
data3d_trans = np.array([[0.0, 0.0, 0.0] for i in range(len(data3d))])
for i in range(len(data3d_centered)):
    data3d_trans[i] = np.dot(transmat3d, data3d_centered[i])
fig, axs = plt.subplots(3, 3, figsize=(9, 9))
for i in range(3):
    for j in range(3):
        axs[i][j].scatter(data3d_trans[:, i], data3d_trans[:, j])
        plt.tight_layout(1)
        axs[i][j].set_xlabel('col {}'.format(i+1))
        axs[i][j].set_ylabel('col {}'.format(j+1))
fig.suptitle('Pairwise scatter plots of columns (PC1, PC2, PC3)', y=1.05, fontsize=16)
plt.show()
    
    
In [23]:
    
transmat3d_inv = np.linalg.inv(transmat3d)
data3d_trans_PC1 = np.copy(data3d_trans)
data3d_trans_PC1[:, 0] = 0
data3d_trans_PC1[:, 1] = 0
data3d_trans_PC1_recov = np.array([[0.0, 0.0, 0.0] for i in range(len(data3d))])
for i in range(len(data3d)):
    data3d_trans_PC1_recov[i] = np.dot(transmat3d_inv, data3d_trans_PC1[i])
data3d_trans_PC12 = np.copy(data3d_trans)
data3d_trans_PC12[:, 0] = 0
data3d_trans_PC12_recov = np.array([[0.0, 0.0, 0.0] for i in range(len(data3d))])
for i in range(len(data3d)):
    data3d_trans_PC12_recov[i] = np.dot(transmat3d_inv, data3d_trans_PC12[i])
data3d_trans_PC123_recov = np.array([[0.0, 0.0, 0.0] for i in range(len(data3d))])
for i in range(len(data3d)):
    data3d_trans_PC123_recov[i] = np.dot(transmat3d_inv, data3d_trans[i])
data3d_trans_PC12[0, :]
    
    Out[23]:
In [24]:
    
plt.figure(figsize=(10, 10))
plt.scatter(data3d_trans_PC123_recov.T[0], data3d_trans_PC123_recov.T[1])
plt.scatter(data3d_trans_PC12_recov.T[0], data3d_trans_PC12_recov.T[1])
plt.scatter(data3d_trans_PC1_recov.T[0], data3d_trans_PC1_recov.T[1])
blue_patch = mpatches.Patch(color='blue', label='PC123')
red_patch = mpatches.Patch(color='red', label='PC12')
green_patch = mpatches.Patch(color='green', label='PC1')
plt.legend(handles=[blue_patch, red_patch, green_patch])
plt.title('Scatter plot from x-y-layer')
plt.grid()
plt.show()
    
    
In [25]:
    
fig = plt.figure(figsize=(11, 11))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(data3d_trans_PC123_recov[:, 0], data3d_trans_PC123_recov[:, 1], data3d_trans_PC123_recov[:, 2])
ax.scatter(data3d_trans_PC12_recov[:, 0], data3d_trans_PC12_recov[:, 1], data3d_trans_PC12_recov[:, 2])
ax.scatter(data3d_trans_PC1_recov[:, 0], data3d_trans_PC1_recov[:, 1], data3d_trans_PC1_recov[:, 2])
blue_patch = mpatches.Patch(color='blue', label='PC123')
red_patch = mpatches.Patch(color='red', label='PC12')
green_patch = mpatches.Patch(color='green', label='PC1')
plt.legend(handles=[blue_patch, red_patch, green_patch])
plt.title('Recovered data points')
plt.show()
    
    
Using only the first PC is too little information to near or compress the data. Using the first two PCs similars the original data quite well.
In [26]:
    
data = np.loadtxt('expDat.txt', delimiter=',', skiprows=1, usecols=range(1, 21))
data.shape, data
    
    Out[26]:
In [28]:
    
data_centered = data - data.mean(axis=0)
data_centered
    
    Out[28]:
In [30]:
    
covariance = np.cov(data_centered.T)
covariance.shape
    
    Out[30]:
In [41]:
    
evals, evecs = np.linalg.eig(covariance)
evecs.T
    
    Out[41]:
In [ ]:
    
    
In [92]:
    
from scipy.ndimage import imread
import os
    
In [95]:
    
n_patches = []
b_patches = []
for img_name in os.listdir('imgpca'):
    img = imread(os.path.join('imgpca', img_name))
    for i in range(500):
        x = np.random.randint(img.shape[0] - 16)
        y = np.random.randint(img.shape[1] - 16)
        patch = img[x:x+16, y:y+16].flatten()
        if img_name.startswith('n'):
            n_patches.append(patch)
        elif img_name.startswith('b'):
            b_patches.append(patch)
        
n_patches = np.array(n_patches)
b_patches = np.array(b_patches)
n_patches.shape, b_patches.shape
    
    Out[95]:
In [96]:
    
# Show some nature patches.
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for ax in axes.flatten():
    plt.sca(ax)
    plt.imshow(n_patches[np.random.randint(len(n_patches))].reshape(16, 16), cmap='Greys', interpolation=None)
    plt.axis('off')
    
    
In [97]:
    
# Show some building patches.
fig, axes = plt.subplots(2, 5, figsize=(15, 6))
for ax in axes.flatten():
    plt.sca(ax)
    plt.imshow(b_patches[np.random.randint(len(b_patches))].reshape(16, 16), cmap='Greys', interpolation=None)
    plt.axis('off')
    
    
In [99]:
    
n_patches_centered = n_patches - n_patches.mean(axis=0)
b_patches_centered = b_patches - b_patches.mean(axis=0)
    
In [100]:
    
n_covariance = np.cov(n_patches_centered.T)
b_covariance = np.cov(b_patches_centered.T)
n_covariance.shape, b_covariance.shape
    
    Out[100]:
In [101]:
    
n_evals, n_evecs = np.linalg.eig(n_covariance)
b_evals, b_evecs = np.linalg.eig(b_covariance)
n_evecs.T.shape, b_evecs.T.shape
    
    Out[101]:
In [103]:
    
# Nature PCAs.
fig, axes = plt.subplots(3, 4, figsize=(15, 10))
for i, ax in enumerate(axes.flatten()):
    plt.sca(ax)
    plt.imshow(n_evecs.T[i].reshape(16, 16), cmap='Greys', interpolation=None)
    plt.axis('off')
    
    
In [104]:
    
# Building PCAs.
fig, axes = plt.subplots(3, 4, figsize=(15, 10))
for i, ax in enumerate(axes.flatten()):
    plt.sca(ax)
    plt.imshow(b_evecs.T[i].reshape(16, 16), cmap='Greys', interpolation=None)
    plt.axis('off')
    
    
The first few PCAs from building and nature images are similar, they represent basic shades and edges (first rows in the plots above). However, the PCAs from the second and third rows above seem different - for buildings, the lines are edgy and straight, while for nature images, they seem to have more natural shapes.
In [135]:
    
plt.plot(n_evals[:100], '.', label='nature')
plt.plot(b_evals[:100], '.', label='buildings')
plt.ylabel('Eigenvalue')
plt.xlabel('PC')
plt.legend()
plt.ylim(0, 80000)
    
    Out[135]:
    
For simplicity, only the first 100 PCs are plotted and the first PC is not shown due to its high eigenvalue. For both image categories, one should keep around 20 PCs according to the Scree test. This represents a compression of 1 - (20/256) = 92 %.
In [ ]: